Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re
import string
from typing import Set, Dict

from nlp_architect.utils.string_utils import StringUtils

# Category names that, when present among a page's extracted categories,
# mark the page title as being part of a name.
PART_NAME_CATEGORIES = ["name", "given name", "surname"]

# Marker that is removed from category names when categories are extracted.
DISAMBIGUATION_TITLE = "(disambiguation)"

# Category labels associated with disambiguation pages.
# NOTE(review): not referenced in the visible part of this module -- presumably
# consumed by other modules; confirm before changing.
DISAMBIGUATION_CATEGORY = ["disambig", "disambiguation"]


class WikipediaPageExtractedRelations(object):
    """Container for the relations extracted from a single Wikipedia page.

    Holds raw and normalized sets of links/titles (disambiguation links,
    categories, aliases, title parenthesis, "is a" relations) plus two flags
    describing the page itself, and can serialize itself to a plain dict.
    """

    # Lower-case markers whose presence in a line flags the page as a
    # "part of a name" page. They are matched through ``find_in_line`` (i.e. as
    # regular expressions) to keep the historical matching semantics.
    _NAME_PART_MARKERS = (
        '===as surname===',
        '===as given name===',
        '===given names===',
        '==as a surname==',
        '==people with the surname==',
        '==family name and surname==',
        'category:given names',
        '{{given name}}',
    )

    def __init__(self, is_part_name: bool = False, is_disambiguation: bool = False,
                 parenthesis: Set[str] = None, disambiguation_links: Set[str] = None,
                 categories: Set[str] = None, aliases: Set[str] = None,
                 be_comp: Set[str] = None, disambiguation_links_norm: Set[str] = None,
                 categories_norm: Set[str] = None, aliases_norm: Set[str] = None,
                 title_parenthesis_norm: Set[str] = None,
                 be_comp_norm: Set[str] = None) -> None:
        """
        Object representing a Wikipedia Relations Schema

        Args:
            is_part_name (bool): Whether page title is part of a name
                (e.g. family name / given name)
            is_disambiguation (bool): Whether page is a disambiguation page
            parenthesis (set): a set of all parenthesis links/titles
            disambiguation_links (set): a set of all disambiguation links/titles
            categories (set): a set of all category links/titles
            aliases (set): a set of all aliases links/titles
            be_comp (set): a set of all "is a" links/titles
            disambiguation_links_norm (set): same as disambiguation_links just normalized
            categories_norm (set): same as categories just normalized, lower and clean
            aliases_norm (set): same as aliases just normalized, lower and clean
            title_parenthesis_norm (set): same as parenthesis just normalized, lower and clean
            be_comp_norm (set): same as be_comp just normalized, lower and clean
        """
        self.is_part_name = is_part_name
        self.is_disambiguation = is_disambiguation
        self.disambiguation_links = disambiguation_links
        # The constructor argument is called ``parenthesis`` but is stored as
        # ``title_parenthesis``.
        self.title_parenthesis = parenthesis
        self.categories = categories
        self.aliases = aliases
        self.be_comp = be_comp

        self.disambiguation_links_norm = disambiguation_links_norm
        self.categories_norm = categories_norm
        self.aliases_norm = aliases_norm
        self.title_parenthesis_norm = title_parenthesis_norm
        self.be_comp_norm = be_comp_norm

    def extract_relations_from_text_v0(self, text):
        """Populate this object's relations by scanning raw page text line by line.

        Resets the disambiguation-link, category and title-parenthesis sets
        (raw and normalized) and ``be_comp_norm``; ``aliases``, ``aliases_norm``
        and ``be_comp`` are left untouched.

        Args:
            text (str): page text; it is split on newlines and every line is
                scanned for categories, name-part markers and links.
        """
        self.disambiguation_links = set()
        self.categories = set()
        self.title_parenthesis = set()

        self.disambiguation_links_norm = set()
        self.categories_norm = set()
        self.title_parenthesis_norm = set()
        # Only the normalized "is a" set is reset; ``be_comp`` keeps whatever
        # the constructor received (possibly None).
        self.be_comp_norm = set()

        ext_links = set()
        title_parenthesis = set()

        for line in text.split('\n'):
            cat_links = self.extract_categories(line)

            # Once the page is flagged as a name part the flag is never cleared.
            if not self.is_part_name:
                self.is_part_name = (
                    self.is_name_part(line)
                    or any(s in cat_links for s in PART_NAME_CATEGORIES)
                )

            self.categories.update(cat_links)
            self.categories_norm.update(StringUtils.normalize_string_list(cat_links))

            # NOTE(review): extract_links_and_parenthesis is not defined in the
            # visible part of this class -- confirm it exists elsewhere.
            links, parenthesis_links = self.extract_links_and_parenthesis(line)
            ext_links.update(links)
            title_parenthesis.update(parenthesis_links)

        # Collected links are only kept as disambiguation links when the page
        # was declared a disambiguation page.
        if self.is_disambiguation:
            self.disambiguation_links = ext_links
            self.disambiguation_links_norm = StringUtils.normalize_string_list(ext_links)

        self.title_parenthesis = title_parenthesis
        self.title_parenthesis_norm = StringUtils.normalize_string_list(title_parenthesis)

    def __str__(self) -> str:
        """Return the main fields joined by ``', '``."""
        fields = (
            self.is_disambiguation,
            self.is_part_name,
            self.disambiguation_links,
            self.be_comp,
            self.title_parenthesis,
            self.categories,
        )
        return ', '.join(str(field) for field in fields)

    @staticmethod
    def _as_list(values) -> list:
        """Return ``values`` as a list, mapping ``None`` to an empty list."""
        return [] if values is None else list(values)

    def toJson(self) -> Dict:
        """Serialize the relations into a dict of plain lists and booleans.

        ``isPartName`` and ``isDisambiguation`` are always present. Each
        raw/normalized pair is emitted only when its gating attribute is not
        None; within an emitted pair, a partner that is still None is written
        as an empty list instead of raising ``TypeError`` (this happened, for
        example, for ``be_comp`` after ``extract_relations_from_text_v0``,
        which only initializes ``be_comp_norm``).

        Returns:
            Dict: the serialized relations.
        """
        as_list = self._as_list
        result_dict = {
            'isPartName': self.is_part_name,
            'isDisambiguation': self.is_disambiguation,
        }

        if self.disambiguation_links is not None:
            result_dict['disambiguationLinks'] = list(self.disambiguation_links)
            result_dict['disambiguationLinksNorm'] = as_list(self.disambiguation_links_norm)

        if self.categories is not None:
            result_dict['categories'] = list(self.categories)
            result_dict['categoriesNorm'] = as_list(self.categories_norm)

        if self.aliases is not None:
            result_dict['aliases'] = list(self.aliases)

        if self.title_parenthesis is not None:
            result_dict['titleParenthesis'] = list(self.title_parenthesis)
            result_dict['titleParenthesisNorm'] = as_list(self.title_parenthesis_norm)

        # This pair is gated on the *normalized* set, so the raw set may be None.
        if self.be_comp_norm is not None:
            result_dict['beCompRelations'] = as_list(self.be_comp)
            result_dict['beCompRelationsNorm'] = list(self.be_comp_norm)

        return result_dict

    @staticmethod
    def extract_categories(line: str) -> Set[str]:
        """Extract category names from one line of page text.

        Two forms are recognized:

        * ``[[Category:<name>]]`` links -- every link on the line yields one
          entry, with ``DISAMBIGUATION_TITLE`` removed from the name;
        * a line consisting solely of a ``{{disambig...}}`` / ``{{Disambig...}}``
          template -- its content is split on ``|`` and every part is added.

        Args:
            line (str): a single line of page text.

        Returns:
            Set[str]: the category names found (empty when there are none).
        """
        categories = set()

        # Non-greedy capture so that several links on the same line are
        # returned separately instead of as one span from the first
        # "[[Category:" to the last "]]".
        for cat in re.findall(r'\[\[Category:(.*?)\]\]', line):
            if DISAMBIGUATION_TITLE in cat:
                cat = cat.replace(DISAMBIGUATION_TITLE, '')
            categories.add(cat)

        prog = re.search('^{{(disambig.*|Disambig.*)}}$', line)
        if prog is not None:
            categories.update(prog.group(1).split('|'))

        return categories

    @staticmethod
    def is_name_part(line: str) -> bool:
        """Tell whether a line carries one of the known "part of a name" markers.

        The line is lower-cased before matching, so the check is
        case-insensitive.

        Args:
            line (str): a single line of page text.

        Returns:
            bool: True when any marker is found in the line.
        """
        line = line.lower()
        return any(
            WikipediaPageExtractedRelations.find_in_line(line, marker)
            for marker in WikipediaPageExtractedRelations._NAME_PART_MARKERS
        )

    @staticmethod
    def find_in_line(text: str, pattern: str) -> bool:
        """Return True when the regular expression ``pattern`` matches anywhere in ``text``.

        Args:
            text (str): the text to search.
            pattern (str): a regular expression (not a plain substring).

        Returns:
            bool: whether at least one match exists.
        """
        return re.search(pattern, text) is not None